## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## here() starts at /Users/Jo/OneDrive/1_Hertie Studies/Thesis/Hertie-Thesis-Mehler
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
theme_set(theme_bw())
data <- read_csv(here("data", "data_combined.csv"),
col_types = cols(
academic_status = col_factor(levels = c("non-academic", "academic")),
educ_cat = col_factor(levels = c("Low", "Intermediate", "High")),
gender = col_factor(levels = c("Male", "Female", "Other")),
monority_cat = col_factor(levels = c("Non-minority", "Minority")),
age_cat = col_factor(levels = c("18-29", "30-49", "50-69", "70+")),
polinterest_cat_3 = col_factor(levels = c("Low", "Intermediate", "High")),
empathy_pc_cat = col_factor(levels = c("Less empathetic", "More empathetic")),
exp_hate_speech_cat = col_factor(levels = c("Less experience", "More experience")),
exp_hostile_engagement_cat = col_factor(levels = c("Less experience", "More experience")),
cluster = col_factor(levels = c("Intent-based", "Harms-based", "Harms-based narrow", "Denying/Nonsense")),
leftright_cat = col_factor(levels = c("liberal/left", "moderate/center", "conservative/right")),
commitment_cat = col_factor(levels = c("Below average", "Above average"))
))## Warning: The following named parsers don't match the column names: monority_cat
# new try
p <- ggpairs(data, # Data frame
columns = indicators, # Columns
aes(alpha = 0.5),
upper = list(continuous = wrap("cor", size = 6),
continuous = "smooth"),
diag = list(mapping = aes(color = academic_status)),
lower = list(
mapping = aes(color = academic_status),
continuous = "smooth",
combo = "facetdensity")
)
p + theme(
text = element_text(size = 16),
axis.text.x = element_text(angle = 45, hjust = 1)
)## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
num_academics <- data %>% filter(academic_status == "academic") %>% nrow()
num_non_academics <- data %>% filter(academic_status == "non-academic") %>% nrow()
data_cluster <- data %>%
select(academic_status, cluster) %>%
group_by(academic_status, cluster) %>%
count() %>%
mutate(share = ifelse(academic_status == "academic", n/num_academics, n/num_non_academics)) %>%
# arrange(cluster) %>%
ungroup() %>%
# Create an ordering variable based on descending shares within each academic_status
arrange(academic_status, desc(share)) %>%
mutate(order = rank(-share)) %>%
mutate(cluster = factor(cluster, levels = unique(cluster)))
# Bar plot with error bars with and legend titles
data_cluster %>%
ggplot(aes(x = reorder(cluster, order), y = share, fill = academic_status)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.75)) +
# Custom colors for each academic_status
scale_fill_manual(values = c("academic" = "orange", "non-academic" = "skyblue")) +
labs(title = "Cluster Distribution by Academic Status",
x = "Cluster",
y = "Frequency",
fill = "Academic Status") + # Legend Title for fill
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(labels = scales::percent)# logged textlength
data %>%
group_by(academic_status) %>%
get_summary_stats(text_length_log2, type = "mean_sd")## # A tibble: 2 × 5
## academic_status variable n mean sd
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 non-academic text_length_log2 458 6.54 1.30
## 2 academic text_length_log2 605 6.54 1.30
##
## Welch Two Sample t-test
##
## data: text_length by academic_status
## t = 0.26877, df = 967.04, p-value = 0.7882
## alternative hypothesis: true difference in means between group non-academic and group academic is not equal to 0
## 95 percent confidence interval:
## -12.43859 16.38641
## sample estimates:
## mean in group non-academic mean in group academic
## 132.6681 130.6942
# statix version
stat.test <- data %>%
t_test(text_length ~ academic_status, detailed = TRUE) %>%
add_significance()
stat.test## # A tibble: 1 × 16
## estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p
## <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 1.97 133. 131. text_l… non-a… acade… 458 605 0.269 0.788
## # ℹ 6 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
## # alternative <chr>, p.signif <chr>
# Create a box-plot
bxp <- ggboxplot(
data, x = "academic_status", y = "text_length",
ylab = "Text Length", xlab = "Groups", add = "jitter"
)
# Add p-value and significance levels
stat.test <- stat.test %>% add_xy_position(x = "academic_status")
bxp +
stat_pvalue_manual(stat.test, tip.length = 0) +
labs(subtitle = get_test_label(stat.test, detailed = TRUE))##
## Welch Two Sample t-test
##
## data: readability_score by academic_status
## t = -2.1874, df = 990.53, p-value = 0.02895
## alternative hypothesis: true difference in means between group non-academic and group academic is not equal to 0
## 95 percent confidence interval:
## -1.39185278 -0.07548526
## sample estimates:
## mean in group non-academic mean in group academic
## 11.14713 11.88080
# t-test
stat.test <- data %>%
t_test(readability_score ~ academic_status, detailed = TRUE) %>%
add_significance()
stat.test## # A tibble: 1 × 16
## estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p
## <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 -0.734 11.1 11.9 reada… non-a… acade… 458 605 -2.19 0.0289
## # ℹ 6 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
## # alternative <chr>, p.signif <chr>
# Create a box-plot
bxp <- ggboxplot(
data, x = "academic_status", y = "readability_score",
ylab = "readability_score", xlab = "Groups", add = "jitter"
)
# Add p-value and significance levels
stat.test <- stat.test %>% add_xy_position(x = "academic_status")
bxp +
stat_pvalue_manual(stat.test, tip.length = 0) +
labs(subtitle = get_test_label(stat.test, detailed = TRUE))Chi-square test of independence: interested in testing whether there’s a significant association between the two categorical variables (group and cluster)
##
## Intent-based Harms-based Harms-based narrow Denying/Nonsense
## non-academic 114 182 75 87
## academic 194 192 124 95
# Conduct Chi-square test of independence
chi_test_result <- chisq.test(table)
# Print the results
print(chi_test_result)##
## Pearson's Chi-squared test
##
## data: table
## X-squared = 13.391, df = 3, p-value = 0.003862
The p-value is 0.06, which is slightly above my alpha level of 0.05. This means my result is not significant (Aktueller Stand: 28. März mit ca. 1000 observations).
##
## Welch Two Sample t-test
##
## data: leftright_pred_error by academic_status
## t = 0.1253, df = 781.05, p-value = 0.9003
## alternative hypothesis: true difference in means between group non-academic and group academic is not equal to 0
## 95 percent confidence interval:
## -0.2201679 0.2501912
## sample estimates:
## mean in group non-academic mean in group academic
## 2.150036 2.135025
# t-test
stat.test <- data %>%
t_test(leftright_pred_error ~ academic_status, detailed = TRUE) %>%
add_significance()
stat.test## # A tibble: 1 × 16
## estimate estimate1 estimate2 .y. group1 group2 n1 n2 statistic p
## <dbl> <dbl> <dbl> <chr> <chr> <chr> <int> <int> <dbl> <dbl>
## 1 0.0150 2.15 2.14 leftri… non-a… acade… 362 478 0.125 0.9
## # ℹ 6 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
## # alternative <chr>, p.signif <chr>
# Create a box-plot
bxp <- ggboxplot(
data, x = "academic_status", y = "leftright_pred_error",
ylab = "leftright pred score", xlab = "Groups", add = "jitter"
)
# Add p-value and significance levels
stat.test <- stat.test %>% add_xy_position(x = "academic_status")
bxp +
stat_pvalue_manual(stat.test, tip.length = 0) +
labs(subtitle = get_test_label(stat.test, detailed = TRUE))## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
p <- ggpairs(data, # Data frame
columns = indicators, # Columns
aes(color = educ_cat, # Color by group (cat. variable)
alpha = 0.5),
upper = list(continuous = wrap("cor", size = 2.7)
))
p## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Create the box plot
p <- ggplot(data, aes(x = educ_cat, y = text_length_log2)) +
geom_point(position = position_jitter(width = 0.2), alpha = 0.5, color = "skyblue") +
geom_boxplot(outliers = FALSE) +
theme_minimal() +
labs(title = "Comparison of Text Length by Education",
x = "Academic Status",
y = "Text Length")
# Print the plot
print(p)# Create the box plot
p <- ggplot(data, aes(x = educ_cat, y = readability_score)) +
geom_point(position = position_jitter(width = 0.2), alpha = 0.5, color = "skyblue") +
geom_boxplot(outliers = FALSE) +
theme_minimal() +
labs(title = "Comparison of Readability by Education",
x = "Academic Status",
y = "Readability Score")
# Print the plot
print(p)# Create the box plot
p <- ggplot(data, aes(x = educ_cat, y = leftright_pred_error)) +
geom_point(position = position_jitter(width = 0.2), alpha = 0.5, color = "skyblue") +
geom_boxplot(outliers = FALSE) +
theme_minimal() +
labs(title = "Comparison of Prediction of Political Orientation by educ_cat",
x = "Education",
y = "Readability Score")
# Print the plot
print(p)## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Define the function
plot_density_comparison <- function(data, numeric_var_name, group_var_name = "academic_status") {
# Ensure the variable names are non-empty and exist in the dataframe
if (!numeric_var_name %in% names(data)) {
stop("Numeric variable name does not exist in the dataframe.")
}
if (!group_var_name %in% names(data)) {
stop("Group variable name does not exist in the dataframe.")
}
# Use ggplot to create the density plot
p <- ggplot(data, aes_string(x = numeric_var_name, fill = group_var_name)) +
geom_density(alpha = 0.5) + # Adjust alpha for transparency
labs(x = numeric_var_name, y = "Density") + # Labels
ggtitle(paste("Density of", numeric_var_name, "by", group_var_name)) +
scale_fill_manual(values = c("academic" = "skyblue", "non-academic" = "pink")) + # Customize colors
theme_minimal() # Use a minimal theme for a nicer plot
return(p)
}
# Example usage with your dataframe 'data' and the variable 'readability_score'
plot1 <- plot_density_comparison(data, "text_length_log2")## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# For readability_score
plot2 <- plot_density_comparison(data, "readability_score")
# For leftright_pred_error
plot3 <- plot_density_comparison(data, "leftright_pred_error")
# add type/cluster indicator later
plot1## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
# Define the function
plot_density_comparison <- function(data, numeric_var_name, group_var_name = "educ_cat") {
# Ensure the variable names are non-empty and exist in the dataframe
if (!numeric_var_name %in% names(data)) {
stop("Numeric variable name does not exist in the dataframe.")
}
if (!group_var_name %in% names(data)) {
stop("Group variable name does not exist in the dataframe.")
}
# Use ggplot to create the density plot
p <- ggplot(data, aes_string(x = numeric_var_name, fill = group_var_name)) +
geom_density(alpha = 0.5) + # Adjust alpha for transparency
labs(x = numeric_var_name, y = "Density") + # Labels
ggtitle(paste("Density of", numeric_var_name, "by", group_var_name)) +
scale_fill_manual(values = c("High" = "skyblue", "Intermediate" = "lightgreen", "Low" = "pink")) + # Customize colors
theme_minimal() # Use a minimal theme for a nicer plot
return(p)
}
# Example usage with your dataframe 'data' and the variable 'text_length_log2'
plot5 <- plot_density_comparison(data, "text_length_log2")
# For readability_score
plot6 <- plot_density_comparison(data, "readability_score")
# For leftright_pred_error
plot7 <- plot_density_comparison(data, "leftright_pred_error")
# add type/cluster indicator later
plot5## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
DID NOT WORK YET!!
#library(patchwork)
#
## Assuming plot1, plot2, plot3, plot5, plot6, plot7 are your ggplot objects
#plot_layout <- plot1 + plot2 + plot3 + plot5 + plot6 + plot7 +
# plot_layout(guides = "collect") &
# theme(legend.position = "bottom")
#
#plot_layout <- plot1 + plot2 + plot3 + plot5 + plot6 + plot7 +
# plot_layout(ncol = 2) # This is the correct usage to specify layout options
#
#
#plot_layout